In this we will be taking an advanced look at a cleanup that the Rubbish team did.
Project uses R vizualization and Python scripting together
Note: DAYS = Baseline audit, monday preparty, SG day 1, and SG day 2
knitr::opts_chunk$set(warning=FALSE, message=FALSE)
####### ALL LIBRARIES USED ###########
###### DATA MANIPULATION ######
options(stringsAsFactors = FALSE)
library(dplyr) # for main data manipulations
library(reticulate) # for python script
library(reshape2) # for dcast()
library(lubridate) # for hours()
source_python("euclidean_script.py") # python script
###### GRAPHING ######
library(RColorBrewer) # for colors in graphs
library(ggplot2) # for lollipop graph
library(hrbrthemes) # for ggplot theme
library(sunburstR) # for sunburst graph
library(streamgraph) # for stream graph
library(heatmaply) # for heatmap graph
library(DT) # for data table
library(leaflet) # for leaflet map
library(htmltools) # for leaflet map
######################################
# setup to get all types
collection = c('trashCan', 'recyclingCan', 'tobaccoAshCan')
litter = c('paper', 'tobacco', 'unknown', 'plastic', 'food', 'glass')
# icon png files
icons = c("cigarette.png", "cigarette.png", "plastic-bottle.png", "cigarette.png", "cigarette.png", "cigarette.png")
# load in cleaned raw data
raw <- read.csv('clean/clean_rubbish.csv') %>%
mutate( # change days from shorthand to full
day = factor(ifelse(day =="Sun", "Sunday",
ifelse(day =="Mon", "Monday",
ifelse(day == "Tue", "Tuesday", "Wednesday"))),
levels = c("Sunday", "Monday", "Tuesday", "Wednesday"))
)# prepping data for plot
plot_data<- raw %>%
subset(city == 'Redwood City') %>%
subset(is_litter == 1) %>%
group_by(rubbishType, day) %>%
summarise(
num_litter = n()
) %>%
group_by(day) %>%
arrange(-num_litter) %>%
ungroup() %>%
arrange(day, num_litter) %>%
mutate(order = row_number())
# plotting lollipop graphs
plot_data %>%
ggplot() +
geom_segment( aes(x=order, xend=order, y=0, yend=num_litter), color="pink", size = 1.5) +
geom_point( aes(x=order, y=num_litter, color="pink"), size=5 ) +
coord_flip()+
theme_ipsum() +
theme(
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
legend.position = "none",
panel.border = element_blank(),
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 15)
) +
xlab("") +
ylab("Pieces of Litter Collected") +
ggtitle("Total Litter Collected by Rubbish Team") +
facet_wrap(~day, ncol=1, scale="free_y") +
scale_x_continuous(
breaks = plot_data$order,
labels = plot_data$rubbishType)# CHANGING IMAGES
#library("ggimage")
# image links must be in dataframe column
#geom_image(aes(image=image), size=.05)GRAPH ABOVE:
NEED:
Collection Objects would be Trash Cans, Recycling Cans, and Tobacco Ash Cans.
explain clustering
# clustering data points with python script
clustered_data <- raw %>%
subset(city == 'Redwood City') %>% # this is a different area
get_euclidean(collection, litter) # python function
#########################
# CLUSTERING PREFORMED
# getting details on clustered data
clusters <- clustered_data %>%
subset(is_litter == 1) %>% # only want objects
group_by(closest_cent, day) %>%
summarise(
cent_type = max(cent_type),
num_litter = n(),
mean_dist = round(mean(distance),2),
median_dist = round(median(distance),2),
sd_dist = round(sd(distance),2),
max_dist = round(max(distance),2),
lat = mean(lat),
long = mean(long)
) %>%
arrange(-num_litter)
# getting data for sunburst plot
clustered_data %>%
subset(is_litter == 1) %>%
group_by(cent_type, day, rubbishType) %>%
summarise(
num_litter = n()
) %>%
mutate( # path is needed for plot
path = paste(cent_type, rubbishType, day, sep="-")
) %>%
subset(select = c(path, num_litter)) %>%
# plotting suburst graph
sunburst(legend=TRUE,
colors = rev(brewer.pal(9, "RdPu")[4:9]),
count = TRUE,
breadcrumb = list(w=150,h=50, t=30),
sumNodes = TRUE
)This shows the breakdown of clusterings, from Collection Object Type, to litter type, then to the day of the week it was collected.
This graph does a great job of breaking down the desparities of Trash Cans and Recycling Cans, while also showing similarities!
To see if the Rubbish team had an effect on the clustering, we should look where they were focused.
get_hour <- function(time){
# function used to save space
return (hour(as.POSIXct(time, format="%Y-%m-%d %H:%M:%S")))
}
# getting plot data for streamgraph
plot_data <- clustered_data %>%
subset(is_litter==1) %>% # only want objects
mutate( # getting hours of days, and making them concurrent
time = ifelse(day=="Sunday", get_hour(time),
ifelse(day=="Monday", get_hour(time)+24,
ifelse(day=="Tuesday", get_hour(time)+48,
get_hour(time)+72))),
# make graph prettier
closest_cent = paste(ifelse(cent_type == "trashCan", "Trash ID:",
ifelse(cent_type == "recyclingCan", "Recyc ID:", "Ash ID:")),
ifelse(closest_cent<10, paste("0",closest_cent, sep=""), closest_cent),
"| Litter Collected",
sep= " ")
) %>%
group_by(closest_cent,time) %>%
summarise(
num_litter = n()
) %>%
arrange(-num_litter)
# plotting Streamgraph
plot_data %>%
streamgraph(
"closest_cent", "num_litter", "time",
interpolate="step", #offset = "zero",
scale = "continuous", width="800px", height="400px"
) %>%
sg_legend(show=TRUE, label="Collection ID: ") %>%
sg_fill_manual(brewer.pal(9, "RdPu")[c(3:9)]) %>%
sg_annotate("Collected Litter per Hour", "300px", "400px")This graph represents the amount of litter collected per hour, for each collection object.
This graph shows two things:
This is a multi-day event, which means is why we see the sharp spikes leading to 0 litter being collected for some hours.
The litter team had stayed by certain collection objects for longer than others.
To see this, we need compare the mean distances for each collection object. It will be better to view this for each individual day, as we know there was more foot traffic on Tuesday and Wednesday.
# prepping data for heatmap plot
plot_data <- clusters %>%
mutate(
closest_cent = paste(ifelse(cent_type == "trashCan", "Trash:",
ifelse(cent_type == "recyclingCan", "Recyc", "Ash")),
ifelse(closest_cent<10, paste("0",closest_cent, sep=""), closest_cent),
sep= " ")
) %>%
subset(select = c(day, closest_cent, mean_dist)) %>%
dcast(closest_cent ~ day, value.var = "mean_dist") %>%
mutate( # chaning na values to 0
Sunday = ifelse(is.na(Sunday),0,Sunday),
Monday = ifelse(is.na(Monday),0,Monday),
Tuesday = ifelse(is.na(Tuesday),0,Tuesday),
Wednesday = ifelse(is.na(Wednesday),0,Wednesday)
)
# changing rownames to centroid id
rownames(plot_data) <- plot_data[,"closest_cent"]
# making dataframe into matrix
plot_data <- plot_data %>%
subset(select = -c(closest_cent)) %>%
as.matrix()
# colors for graph
colors <- brewer.pal(9, "RdPu")[c(1, 6:9)]
colors[1] <- "#ffffff"
# plotting heatmap
plot_data %>%
heatmaply(
plot_method = "plotly",
colors = colorRampPalette(colors),
dendogram = "both",
show_dendrogram = c(FALSE, FALSE),
label_names = c("Day", "Collection ID", "Mean Distance"),
grid_color = "white",
main = "Mean Distance of Litter from the (Trash / Recyclying / Ash) Can",
#ylab = "Collection Objects (ID)",
xlab = "A distance of 0 means there are no objects around the Collection Object.",
key.title = "meters",
showticklabels = c(TRUE, TRUE),
column_text_angle = 0,
colorbar_len = .8,
grid_gap = 1
) %>%
layout(width=800)This graph shows that ||| PERCENTAGE ||| of the litter has a mean distance of under 20 meters to the closest applicable Collection Object.
Well here is a searchable data table to view the clusteringss yourself, along with some basic statistics for each!
# getting data for data table
clustered_data %>%
subset(obj_id != -1) %>% # only want objects
group_by(closest_cent) %>%
summarise(
cent_type = max(cent_type),
num_litter = n(),
mean_dist = round(mean(distance),2),
median_dist = round(median(distance),2),
sd_dist = round(sd(distance),2),
max_dist = round(max(distance),2)
) %>%
arrange(-num_litter) %>%
rename( # renaming columns
"Collection ID" = closest_cent,
"Collection Type" = cent_type,
"Number of Litter" = num_litter,
"Mean Distance (m)" = mean_dist,
"Median Distance (m)" = median_dist,
"Standard Deviation (m)" = sd_dist,
"Max Distance (m)" = max_dist
) %>%
# plotting data table
datatable()We have looked at all the numbers, lets try and see what this actually means in regards to real life!
colors <- colorNumeric(
# function for color of collection types
palette = c('#134a47', '#2859b8', 'green'),
domain = c(1,2,3)
)
labs <- lapply(seq(nrow(clusters)), function(i) {
# function for mouseover of circles
paste0( 'Collection Type: ', clusters[i, "cent_type"], '<p>Number of Litter Objects: ',
clusters[i, "num_litter"], '</p>Mean Distance: ',
round(clusters[i, "mean_dist"],2),' meters<p>Max Distance: ',
round(clusters[i, "max_dist"],2), ' meters</p>' )
})
html_legend <- "<img src='open-trash-can.png' style='width:20px;height:20px;'> Trash & Recycling<br/>
<img src='open-trash-can.png' style='width:20px;height:20px;'> Paper<br/>
<img src='cigarette.png' style='width:20px;height:20px;'> Tobacco<br/>
<img src='open-trash-can.png' style='width:20px;height:20px;'> Unknown<br/>
<img src='open-trash-can.png' style='width:20px;height:20px;'> Plastic<br/>
<img src='open-trash-can.png' style='width:20px;height:20px;'> Food<br/>
<img src='open-trash-can.png' style='width:20px;height:20px;'> Glass"
map <- clusters %>%
mutate(
cent_type = ifelse(cent_type == 'trashCan', 1,
ifelse(cent_type == 'recyclingCan', 2, 3))
) %>%
leaflet(width = "100%") %>%
setView(lng = -122.2298, lat = 37.48650, zoom = 18) %>%
addProviderTiles(providers$CartoDB.Positron) %>%
addCircles(
~long, ~lat,
radius = ~mean_dist,
color = ~colors(cent_type),
opacity = ~num_litter/357,
label = lapply(labs, HTML),
group = "Collections"
) %>%
addMarkers(lng = ~long, lat = ~lat,
icon = makeIcon("open-trash-can.png", "open-trash-can.png", 15,15),
group = "Collections"
) %>%
addLayersControl(
overlayGroups = c("Collections", litter),
options = layersControlOptions(collapsed = FALSE)
) %>%
addControl(html = html_legend, position = "bottomleft")
for (i in 1:length(litter)){
map <- map %>%
addMarkers(data = clustered_data %>%
subset(rubbishType == litter[[i]]),
lng = ~long, lat = ~lat,
icon = makeIcon(icons[[i]], icons[[i]], 7,7),
group = litter[[i]]
)
}
mapffffffffffffffffffffffffffffff
map looks disgusting
hexbin?
clusters based on polygons?
A work by Alexander Kahanek x Rubbish, co.